import json
import pandas as pd
import jieba
import re
import jieba.posseg as psg


def chinese_word_cut(text):
    # Load the custom dictionary before segmentation. Note that this (and the
    # stopword file below) is re-read on every call; for large datasets it is
    # cheaper to load them once outside this function.
    jieba.load_userdict("../assist_data/add_word_list.txt")
    jieba.initialize()

    # Read the stopword list; fall back to an empty list if the file is missing.
    stop_list = []
    try:
        with open("../assist_data/stop_word_list.txt", encoding='utf-8') as stopword_file:
            for word in stopword_file:
                stop_list.append(word.strip())
    except FileNotFoundError:
        print("stopword file not found: ../assist_data/stop_word_list.txt")

    # Keep only nouns (n), other proper nouns (nz) and verbal nouns (vn).
    flag_list = ['n', 'nz', 'vn']

    word_list = []
    # POS-tag and segment the text with jieba.
    seg_list = psg.cut(text)
    for seg_word in seg_list:
        # Strip everything that is not a Chinese character.
        word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)
        # Skip single characters, stopwords, and words with unwanted POS tags.
        if len(word) < 2 or word in stop_list:
            continue
        if seg_word.flag in flag_list:
            word_list.append(word)
    return " ".join(word_list)


def read_json():
    # Each line of the file is a standalone JSON object (JSON Lines format).
    data = {"time": [], "content": []}

    with open("../data/search_spider.jsonl", encoding="UTF-8") as json_file:
        for line in json_file:
            tweet = json.loads(line)
            data["time"].append(tweet["created_at"])
            data["content"].append(tweet["content"])

    return pd.DataFrame(data)


if __name__ == '__main__':
    data = read_json()
    print(data)
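    # Tokenize every post; chinese_word_cut returns a space-separated string
    # of the kept words, stored in a new "tokenized" column.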
    data["tokenized"] = data.content.apply(chinese_word_cut)
    print(data)
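    # A minimal sketch for persisting the tokenized posts for later reuse
    # (e.g. topic modelling); the output path is an assumption, not part of
    # the original script.
    # data.to_csv("../data/tokenized_posts.csv", index=False)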


